blob: 65d06e53f3bdc370b22234aea5a72532035ba124 [file] [log] [blame]
Guido van Rossumb51eaa11997-03-07 00:21:55 +00001"""Tokenization help for Python programs.
Guido van Rossum4d8e8591992-01-01 19:34:47 +00002
Florent Xicluna43e4ea12010-09-03 19:54:02 +00003tokenize(readline) is a generator that breaks a stream of bytes into
4Python tokens. It decodes the bytes according to PEP-0263 for
5determining source file encoding.
Trent Nelson428de652008-03-18 22:41:35 +00006
Florent Xicluna43e4ea12010-09-03 19:54:02 +00007It accepts a readline-like method which is called repeatedly to get the
8next line of input (or b"" for EOF). It generates 5-tuples with these
9members:
Tim Peters4efb6e92001-06-29 23:51:08 +000010
11 the token type (see token.py)
12 the token (a string)
13 the starting (row, column) indices of the token (a 2-tuple of ints)
14 the ending (row, column) indices of the token (a 2-tuple of ints)
15 the original line (string)
16
17It is designed to match the working of the Python tokenizer exactly, except
18that it produces COMMENT tokens for comments and gives type OP for all
Florent Xicluna43e4ea12010-09-03 19:54:02 +000019operators. Additionally, all token lists start with an ENCODING token
20which tells you which encoding was used to decode the bytes stream.
21"""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000022
Ka-Ping Yee244c5932001-03-01 13:56:40 +000023__author__ = 'Ka-Ping Yee <ping@lfw.org>'
Trent Nelson428de652008-03-18 22:41:35 +000024__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
26 'Michael Foord')
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020027from builtins import open as _builtin_open
Benjamin Peterson433f32c2008-12-12 01:25:05 +000028from codecs import lookup, BOM_UTF8
Raymond Hettinger3fb79c72010-09-09 07:15:18 +000029import collections
Victor Stinner58c07522010-11-09 01:08:59 +000030from io import TextIOWrapper
Terry Jan Reedy5b8d2c32014-02-17 23:12:16 -050031from itertools import chain
32import re
33import sys
34from token import *
35
# PEP 263 "coding:" cookie on a (decoded) source line; group(1) is the
# declared encoding name.
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A (still-encoded) line that is blank or comment-only, i.e. a line that
# may legally precede the encoding cookie on line two.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000038
import token
# Re-export everything token.py exports, plus the extra names this
# module defines on top of it.
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token  # only needed to build __all__

# Extra token types produced by this module but not by the C tokenizer.
# They are allocated just past the types defined in token.py and
# registered in tok_name so they print symbolically.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
# Exact token type for every operator/delimiter string; used by
# TokenInfo.exact_type to refine the generic OP type.
EXACT_TOKEN_TYPES = {
    # Brackets and delimiters.
    '(': LPAR, ')': RPAR, '[': LSQB, ']': RSQB,
    '{': LBRACE, '}': RBRACE,
    ':': COLON, ',': COMMA, ';': SEMI, '.': DOT,
    # Arithmetic and bitwise operators.
    '+': PLUS, '-': MINUS, '*': STAR, '/': SLASH, '//': DOUBLESLASH,
    '%': PERCENT, '@': AT, '**': DOUBLESTAR,
    '|': VBAR, '&': AMPER, '^': CIRCUMFLEX, '~': TILDE,
    '<<': LEFTSHIFT, '>>': RIGHTSHIFT,
    # Comparisons.
    '<': LESS, '>': GREATER, '==': EQEQUAL, '!=': NOTEQUAL,
    '<=': LESSEQUAL, '>=': GREATEREQUAL,
    # Assignment and augmented assignment.
    '=': EQUAL,
    '+=': PLUSEQUAL, '-=': MINEQUAL, '*=': STAREQUAL, '/=': SLASHEQUAL,
    '//=': DOUBLESLASHEQUAL, '%=': PERCENTEQUAL, '@=': ATEQUAL,
    '**=': DOUBLESTAREQUAL,
    '&=': AMPEREQUAL, '|=': VBAREQUAL, '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL, '>>=': RIGHTSHIFTEQUAL,
}
Guido van Rossum1aec3231997-04-08 14:24:39 +000097
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    """A single token: numeric type, text, (row, col) start/end
    positions, and the physical line it was found on."""

    def __repr__(self):
        # Render the numeric type together with its symbolic name.
        return ('TokenInfo(type={} ({}), string={!r}, start={!r}, '
                'end={!r}, line={!r})'.format(
                    self.type, tok_name[self.type],
                    self.string, self.start, self.end, self.line))

    @property
    def exact_type(self):
        """The most specific type: OP tokens are refined to the exact
        operator type when the string is a known operator."""
        if self.type != OP:
            return self.type
        return EXACT_TOKEN_TYPES.get(self.string, self.type)
110
def group(*choices):
    """Return a regex group matching any one of *choices*."""
    return '({})'.format('|'.join(choices))

def any(*choices):
    """Return a regex matching zero or more repetitions of *choices*."""
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching zero or one occurrence of *choices*."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000114
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Whitespace, any number of backslash line-continuations, then an
# optional trailing comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

# Integer literals (hex/binary/octal/decimal) and float/imaginary forms.
Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Optional string prefix: b/B optionally followed by r/R, r/R optionally
# followed by b/B, or a lone u/U.
StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string: either the whole string, or up
# to a backslash line-continuation (the rest is matched by Single/Double).
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# Master pattern used by _tokenize's main loop.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000171
Benjamin Peterson33856de2010-08-30 14:41:20 +0000172def _compile(expr):
173 return re.compile(expr, re.UNICODE)
174
# Map each string opener (optional prefix + quote) to the regex that
# matches the remainder of that string.  One-character quotes share the
# Single/Double tails regardless of prefix, so only the bare forms are
# listed; bare one-letter prefixes map to None.
endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3}
for _prefix in ("r", "R", "b", "B", "br", "bR", "Br", "BR",
                "rb", "Rb", "rB", "RB", "u", "U"):
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3
for _prefix in ("r", "R", "b", "B", "u", "U"):
    endpats[_prefix] = None
del _prefix
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000193
# Every legal combination of string prefix (including none) with each
# quote style, mapped to itself for fast membership tests.
_str_prefixes = ("", "r", "R", "b", "B", "br", "Br", "bR", "BR",
                 "rb", "rB", "Rb", "RB", "u", "U")
triple_quoted = {p + q: p + q
                 for p in _str_prefixes for q in ("'''", '"""')}
single_quoted = {p + q: p + q
                 for p in _str_prefixes for q in ("'", '"')}
del _str_prefixes

# Number of columns a tab advances to (classic 8-column tab stops).
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +0000218
class TokenError(Exception):
    """Raised when EOF is reached inside an unterminated construct
    (multi-line string or continued statement)."""


class StopTokenizing(Exception):
    """Signals that tokenization should stop early."""
Fred Drake9b8d8012000-08-17 04:45:13 +0000222
Tim Peters5ca576e2001-06-18 22:08:13 +0000223
class Untokenizer:
    """Rebuilds source text from a token stream.

    Full 5-tuples are reproduced with exact positions via untokenize();
    if the stream degrades to 2-tuples, compat() takes over and emits a
    readable but position-free approximation.
    """

    def __init__(self):
        self.tokens = []        # accumulated output fragments
        self.prev_row = 1       # row where the previous token ended
        self.prev_col = 0       # column where the previous token ended
        self.encoding = None    # set from an ENCODING token, if seen

    def add_whitespace(self, start):
        """Emit filler so the next token begins at position *start*."""
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            # Bridge skipped rows with explicit line continuations.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Render an iterable of token tuples back to a string."""
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                # Position info is gone; fall back to the lossy path.
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                # Remember the literal indent string; it is replayed at
                # the start of each following line rather than emitted here.
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Best-effort rendering for 2-tuple (type, string) tokens.

        *token* is the first 2-tuple already taken from *iterable*.
        """
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            # Pad names/numbers so adjacent tokens cannot merge.
            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000317
Trent Nelson428de652008-03-18 22:41:35 +0000318
def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        # Encode with whatever encoding the leading ENCODING token carried.
        out = out.encode(ut.encoding)
    return out
Raymond Hettinger68c04532005-06-10 11:05:19 +0000344
Trent Nelson428de652008-03-18 22:41:35 +0000345
Benjamin Petersond3afada2009-10-09 21:43:09 +0000346def _get_normal_name(orig_enc):
347 """Imitates get_normal_name in tokenizer.c."""
348 # Only care about the first 12 characters.
349 enc = orig_enc[:12].lower().replace("_", "-")
350 if enc == "utf-8" or enc.startswith("utf-8-"):
351 return "utf-8"
352 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
353 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
354 return "iso-8859-1"
355 return orig_enc
356
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    # The filename is used only to enrich SyntaxError messages; readline
    # may not be a bound method of a named file object.
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        # Treat an exhausted readline iterator the same as EOF.
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        """Return the cookie-declared encoding on *line*, or None."""
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            # Validation only; the codec object itself is not used.
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            # A BOM pins the encoding to utf-8; any other cookie conflicts.
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        # First line is real code, so a cookie can no longer appear.
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
Trent Nelson428de652008-03-18 22:41:35 +0000448
449
def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    stream = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(stream.readline)
        # Rewind so the text wrapper re-reads the lines consumed above.
        stream.seek(0)
        wrapper = TextIOWrapper(stream, encoding, line_buffering=True)
        wrapper.mode = 'r'
        return wrapper
    except:
        # Don't leak the binary handle if detection or wrapping fails.
        stream.close()
        raise
Victor Stinner58c07522010-11-09 01:08:59 +0000464
465
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # Imported here to avoid problems when the itertools module is not
    # built yet and tokenize is imported during interpreter bootstrap.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    # Replay the lines consumed by encoding detection, then the rest of
    # the stream, then an endless supply of b"" so _tokenize sees EOF.
    source = chain(consumed, iter(readline, b""), repeat(b""))
    return _tokenize(source.__next__, encoding)
Trent Nelson428de652008-03-18 22:41:35 +0000492
493
494def _tokenize(readline, encoding):
Guido van Rossum1aec3231997-04-08 14:24:39 +0000495 lnum = parenlev = continued = 0
Benjamin Peterson33856de2010-08-30 14:41:20 +0000496 numchars = '0123456789'
Guido van Rossumde655271997-04-09 17:15:54 +0000497 contstr, needcont = '', 0
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000498 contline = None
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000499 indents = [0]
Guido van Rossum1aec3231997-04-08 14:24:39 +0000500
Yury Selivanov96ec9342015-07-23 15:01:58 +0300501 # 'stashed' and 'async_*' are used for async/await parsing
Yury Selivanov75445082015-05-11 22:57:16 -0400502 stashed = None
Yury Selivanov96ec9342015-07-23 15:01:58 +0300503 async_def = False
504 async_def_indent = 0
505 async_def_nl = False
Yury Selivanov75445082015-05-11 22:57:16 -0400506
Trent Nelson428de652008-03-18 22:41:35 +0000507 if encoding is not None:
Benjamin Peterson689a5582010-03-18 22:29:52 +0000508 if encoding == "utf-8-sig":
509 # BOM will already have been stripped.
510 encoding = "utf-8"
Raymond Hettingera48db392009-04-29 00:34:27 +0000511 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
Benjamin Peterson0fe14382008-06-05 23:07:42 +0000512 while True: # loop over lines in stream
Raymond Hettinger68c04532005-06-10 11:05:19 +0000513 try:
514 line = readline()
515 except StopIteration:
Trent Nelson428de652008-03-18 22:41:35 +0000516 line = b''
517
518 if encoding is not None:
519 line = line.decode(encoding)
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000520 lnum += 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000521 pos, max = 0, len(line)
522
523 if contstr: # continued string
Guido van Rossumde655271997-04-09 17:15:54 +0000524 if not line:
Collin Winterce36ad82007-08-30 01:19:48 +0000525 raise TokenError("EOF in multi-line string", strstart)
Guido van Rossum3b631771997-10-27 20:44:15 +0000526 endmatch = endprog.match(line)
527 if endmatch:
528 pos = end = endmatch.end(0)
Raymond Hettingera48db392009-04-29 00:34:27 +0000529 yield TokenInfo(STRING, contstr + line[:end],
Thomas Wouters89f507f2006-12-13 04:49:30 +0000530 strstart, (lnum, end), contline + line)
Guido van Rossumde655271997-04-09 17:15:54 +0000531 contstr, needcont = '', 0
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000532 contline = None
Guido van Rossumde655271997-04-09 17:15:54 +0000533 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
Raymond Hettingera48db392009-04-29 00:34:27 +0000534 yield TokenInfo(ERRORTOKEN, contstr + line,
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000535 strstart, (lnum, len(line)), contline)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000536 contstr = ''
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000537 contline = None
Guido van Rossumde655271997-04-09 17:15:54 +0000538 continue
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000539 else:
540 contstr = contstr + line
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000541 contline = contline + line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000542 continue
543
Guido van Rossum1aec3231997-04-08 14:24:39 +0000544 elif parenlev == 0 and not continued: # new statement
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000545 if not line: break
546 column = 0
Guido van Rossum1aec3231997-04-08 14:24:39 +0000547 while pos < max: # measure leading whitespace
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000548 if line[pos] == ' ':
549 column += 1
550 elif line[pos] == '\t':
551 column = (column//tabsize + 1)*tabsize
552 elif line[pos] == '\f':
553 column = 0
554 else:
555 break
556 pos += 1
557 if pos == max:
558 break
Guido van Rossum1aec3231997-04-08 14:24:39 +0000559
560 if line[pos] in '#\r\n': # skip comments or blank lines
Thomas Wouters89f507f2006-12-13 04:49:30 +0000561 if line[pos] == '#':
562 comment_token = line[pos:].rstrip('\r\n')
563 nl_pos = pos + len(comment_token)
Raymond Hettingera48db392009-04-29 00:34:27 +0000564 yield TokenInfo(COMMENT, comment_token,
Thomas Wouters89f507f2006-12-13 04:49:30 +0000565 (lnum, pos), (lnum, pos + len(comment_token)), line)
Raymond Hettingera48db392009-04-29 00:34:27 +0000566 yield TokenInfo(NL, line[nl_pos:],
Thomas Wouters89f507f2006-12-13 04:49:30 +0000567 (lnum, nl_pos), (lnum, len(line)), line)
568 else:
Raymond Hettingera48db392009-04-29 00:34:27 +0000569 yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
Guido van Rossum1aec3231997-04-08 14:24:39 +0000570 (lnum, pos), (lnum, len(line)), line)
571 continue
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000572
573 if column > indents[-1]: # count indents or dedents
574 indents.append(column)
Raymond Hettingera48db392009-04-29 00:34:27 +0000575 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000576 while column < indents[-1]:
Raymond Hettingerda99d1c2005-06-21 07:43:58 +0000577 if column not in indents:
578 raise IndentationError(
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000579 "unindent does not match any outer indentation level",
580 ("<tokenize>", lnum, pos, line))
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000581 indents = indents[:-1]
Yury Selivanov75445082015-05-11 22:57:16 -0400582
Yury Selivanov96ec9342015-07-23 15:01:58 +0300583 if async_def and async_def_indent >= indents[-1]:
584 async_def = False
585 async_def_nl = False
586 async_def_indent = 0
Yury Selivanov75445082015-05-11 22:57:16 -0400587
Raymond Hettingera48db392009-04-29 00:34:27 +0000588 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000589
Yury Selivanov96ec9342015-07-23 15:01:58 +0300590 if async_def and async_def_nl and async_def_indent >= indents[-1]:
591 async_def = False
592 async_def_nl = False
593 async_def_indent = 0
594
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000595 else: # continued statement
Guido van Rossumde655271997-04-09 17:15:54 +0000596 if not line:
Collin Winterce36ad82007-08-30 01:19:48 +0000597 raise TokenError("EOF in multi-line statement", (lnum, 0))
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000598 continued = 0
599
600 while pos < max:
Antoine Pitrou10a99b02011-10-11 15:45:56 +0200601 pseudomatch = _compile(PseudoToken).match(line, pos)
Guido van Rossum3b631771997-10-27 20:44:15 +0000602 if pseudomatch: # scan for tokens
603 start, end = pseudomatch.span(1)
Guido van Rossumde655271997-04-09 17:15:54 +0000604 spos, epos, pos = (lnum, start), (lnum, end), end
Ezio Melotti2cc3b4b2012-11-03 17:38:43 +0200605 if start == end:
606 continue
Guido van Rossum1aec3231997-04-08 14:24:39 +0000607 token, initial = line[start:end], line[start]
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000608
Georg Brandldde00282007-03-18 19:01:53 +0000609 if (initial in numchars or # ordinary number
610 (initial == '.' and token != '.' and token != '...')):
Raymond Hettingera48db392009-04-29 00:34:27 +0000611 yield TokenInfo(NUMBER, token, spos, epos, line)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000612 elif initial in '\r\n':
Yury Selivanov75445082015-05-11 22:57:16 -0400613 if stashed:
614 yield stashed
615 stashed = None
Yury Selivanov96ec9342015-07-23 15:01:58 +0300616 if parenlev > 0:
617 yield TokenInfo(NL, token, spos, epos, line)
618 else:
619 yield TokenInfo(NEWLINE, token, spos, epos, line)
620 if async_def:
621 async_def_nl = True
622
Guido van Rossum1aec3231997-04-08 14:24:39 +0000623 elif initial == '#':
Thomas Wouters89f507f2006-12-13 04:49:30 +0000624 assert not token.endswith("\n")
Yury Selivanov75445082015-05-11 22:57:16 -0400625 if stashed:
626 yield stashed
627 stashed = None
Raymond Hettingera48db392009-04-29 00:34:27 +0000628 yield TokenInfo(COMMENT, token, spos, epos, line)
Guido van Rossum9d6897a2002-08-24 06:54:19 +0000629 elif token in triple_quoted:
Antoine Pitrou10a99b02011-10-11 15:45:56 +0200630 endprog = _compile(endpats[token])
Guido van Rossum3b631771997-10-27 20:44:15 +0000631 endmatch = endprog.match(line, pos)
632 if endmatch: # all on one line
633 pos = endmatch.end(0)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000634 token = line[start:pos]
Raymond Hettingera48db392009-04-29 00:34:27 +0000635 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000636 else:
Guido van Rossum1aec3231997-04-08 14:24:39 +0000637 strstart = (lnum, start) # multiple lines
638 contstr = line[start:]
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000639 contline = line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000640 break
Guido van Rossum9d6897a2002-08-24 06:54:19 +0000641 elif initial in single_quoted or \
642 token[:2] in single_quoted or \
643 token[:3] in single_quoted:
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000644 if token[-1] == '\n': # continued string
Guido van Rossum1aec3231997-04-08 14:24:39 +0000645 strstart = (lnum, start)
Antoine Pitrou10a99b02011-10-11 15:45:56 +0200646 endprog = _compile(endpats[initial] or
647 endpats[token[1]] or
648 endpats[token[2]])
Guido van Rossumde655271997-04-09 17:15:54 +0000649 contstr, needcont = line[start:], 1
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000650 contline = line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000651 break
652 else: # ordinary string
Raymond Hettingera48db392009-04-29 00:34:27 +0000653 yield TokenInfo(STRING, token, spos, epos, line)
Benjamin Peterson33856de2010-08-30 14:41:20 +0000654 elif initial.isidentifier(): # ordinary name
Yury Selivanov75445082015-05-11 22:57:16 -0400655 if token in ('async', 'await'):
Yury Selivanov96ec9342015-07-23 15:01:58 +0300656 if async_def:
Yury Selivanov75445082015-05-11 22:57:16 -0400657 yield TokenInfo(
658 ASYNC if token == 'async' else AWAIT,
659 token, spos, epos, line)
660 continue
661
662 tok = TokenInfo(NAME, token, spos, epos, line)
663 if token == 'async' and not stashed:
664 stashed = tok
665 continue
666
667 if token == 'def':
668 if (stashed
669 and stashed.type == NAME
670 and stashed.string == 'async'):
671
Yury Selivanov96ec9342015-07-23 15:01:58 +0300672 async_def = True
673 async_def_indent = indents[-1]
Yury Selivanov75445082015-05-11 22:57:16 -0400674
675 yield TokenInfo(ASYNC, stashed.string,
676 stashed.start, stashed.end,
677 stashed.line)
678 stashed = None
Yury Selivanov75445082015-05-11 22:57:16 -0400679
680 if stashed:
681 yield stashed
682 stashed = None
683
684 yield tok
Guido van Rossum3b631771997-10-27 20:44:15 +0000685 elif initial == '\\': # continued stmt
686 continued = 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000687 else:
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000688 if initial in '([{':
689 parenlev += 1
690 elif initial in ')]}':
691 parenlev -= 1
Yury Selivanov75445082015-05-11 22:57:16 -0400692 if stashed:
693 yield stashed
694 stashed = None
Raymond Hettingera48db392009-04-29 00:34:27 +0000695 yield TokenInfo(OP, token, spos, epos, line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000696 else:
Raymond Hettingera48db392009-04-29 00:34:27 +0000697 yield TokenInfo(ERRORTOKEN, line[pos],
Guido van Rossumde655271997-04-09 17:15:54 +0000698 (lnum, pos), (lnum, pos+1), line)
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000699 pos += 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000700
Yury Selivanov75445082015-05-11 22:57:16 -0400701 if stashed:
702 yield stashed
703 stashed = None
704
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000705 for indent in indents[1:]: # pop remaining indent levels
Raymond Hettingera48db392009-04-29 00:34:27 +0000706 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
707 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000708
Trent Nelson428de652008-03-18 22:41:35 +0000709
# An undocumented, backwards-compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    """Tokenize a source where *readline* yields str lines rather than bytes.

    Backward-compatible helper for standard-library callers: delegates to
    _tokenize() with encoding=None (presumably suppressing encoding
    detection and the leading ENCODING token -- confirm against _tokenize).
    """
    return _tokenize(readline, None)
Raymond Hettinger6c60d092010-09-09 04:32:39 +0000714
def main():
    """Command-line entry point: print the tokenization of a file or stdin.

    One line per token is written to stdout in the form
    "row,col-row,col:  TOKEN_NAME  'string'"; errors are reported on
    stderr and terminate the process with exit status 1.
    """
    import argparse

    # --- error-reporting helpers --------------------------------------
    def write_err(text):
        print(text, file=sys.stderr)

    def die(message, filename=None, location=None):
        # Report with as much position information as we were given,
        # then exit with a failure status.
        if location:
            write_err("%s:%d:%d: error: %s" % ((filename,) + location + (message,)))
        elif filename:
            write_err("%s: error: %s" % (filename, message))
        else:
            write_err("error: %s" % message)
        sys.exit(1)

    # --- command-line parsing -----------------------------------------
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize either the named file (opened as bytes so the encoding
        # can be detected) or stdin (already-decoded str lines).
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Emit one formatted line per token.
        for token in tokens:
            token_type = token.exact_type if args.exact else token.type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        die(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        die(err.args[0], filename, (line, column))
    except SyntaxError as err:
        die(err, filename)
    except OSError as err:
        die(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        write_err("unexpected error: %s" % err)
        raise
774
# Run the command-line interface when this module is executed directly
# (e.g. "python -m tokenize").
if __name__ == "__main__":
    main()