Blame - Lib/lib2to3/pgen2/tokenize.py - platform/external/python/cpython3

2008-03-19 05:04:44 +0000

[diff] [blame]

"""Tokenization help for Python programs.

5

6

generate_tokens(readline) is a generator that breaks a stream of

7

text into Python tokens. It accepts a readline-like method which is called

8

repeatedly to get the next line of input (or "" for EOF). It generates

9

5-tuples with these members:

10

11

the token type (see token.py)

12

the token (a string)

13

the starting (row, column) indices of the token (a 2-tuple of ints)

14

the ending (row, column) indices of the token (a 2-tuple of ints)

15

the original line (string)

16

17

It is designed to match the working of the Python tokenizer exactly, except

18

that it produces COMMENT tokens for comments and gives type OP for all

operators

Older entry points

tokenize_loop(readline, tokeneater)

23

tokenize(readline, tokeneater=printtoken)

24

are the same, except instead of generating tokens, tokeneater is a callback

25

function to which the 5 fields described above are passed as 5 arguments,

26

each time a new token is found."""

27

28

__author__ = 'Ka-Ping Yee <ping@lfw.org>'

29

__credits__ = \

30

'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

31

32

import string, re

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

33

from codecs import BOM_UTF8, lookup

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

34

from lib2to3.pgen2.token import *

35

36

from . import token

37

__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",

38

"generate_tokens", "untokenize"]

39

del token

40

Benjamin Peterson

8d26b0b

2010-05-07 19:10:11 +0000

[diff] [blame]

try:

bytes

except NameError:

# Support bytes type in Python <= 2.5, so 2to3 turns itself into

45

# valid Python 3 code.

46

bytes = str

47

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

48

def group(*choices): return '(' + '|'.join(choices) + ')'

49

def any(*choices): return group(*choices) + '*'

50

def maybe(*choices): return group(*choices) + '?'

51

52

Whitespace = r'[ \f\t]*'

53

Comment = r'#[^\r\n]*'

54

Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)

55

Name = r'[a-zA-Z_]\w*'

56

Nevada Sanchez

a6e395d

2017-04-13 13:32:54 -0400

[diff] [blame]

57

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'

58

Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'

59

Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'

60

Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

61

Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)

Nevada Sanchez

a6e395d

2017-04-13 13:32:54 -0400

[diff] [blame]

62

Exponent = r'[eE][-+]?\d+(?:_\d+)*'

63

Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)

64

Expfloat = r'\d+(?:_\d+)*' + Exponent

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

65

Floatnumber = group(Pointfloat, Expfloat)

Nevada Sanchez

a6e395d

2017-04-13 13:32:54 -0400

[diff] [blame]

66

Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

67

Number = group(Imagnumber, Floatnumber, Intnumber)

68

69

# Tail end of ' string.

70

Single = r"[^'\\]*(?:\\.[^'\\]*)*'"

71

# Tail end of " string.

72

Double = r'[^"\\]*(?:\\.[^"\\]*)*"'

73

# Tail end of ''' string.

74

Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"

75

# Tail end of """ string.

76

Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'

Łukasz Langa

2017-05-22 15:19:09 -0700

[diff] [blame^]

77

_litprefix = r"(?:[uUrRbB]|[rR][bB]|[bBuU][rR])?"

78

Triple = group(_litprefix + "'''", _litprefix + '"""')

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

79

# Single-line ' or " string.

Łukasz Langa

2017-05-22 15:19:09 -0700

[diff] [blame^]

80

String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",

81

_litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

82

83

# Because of leftmost-then-longest match semantics, be sure to put the

84

# longest operators first (e.g., if = came before ==, == would get

85

# recognized as two instances of =).

86

Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",

87

r"//=?", r"->",

Benjamin Peterson

4ab92c8

2014-04-10 00:12:47 -0400

[diff] [blame]

88

r"[+\-*/%&@|^=<>]=?",

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

r"~")

Bracket = '[][(){}]'

Special = group(r'\r?\n', r'[:;.,`@]')

93

Funny = group(Operator, Bracket, Special)

94

95

PlainToken = group(Number, Funny, String, Name)

96

Token = Ignore + PlainToken

97

98

# First (or only) line of ' or " string.

Łukasz Langa

2017-05-22 15:19:09 -0700

[diff] [blame^]

99

ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

100

group("'", r'\\\r?\n'),

Łukasz Langa

2017-05-22 15:19:09 -0700

[diff] [blame^]

101

_litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

102

group('"', r'\\\r?\n'))

103

PseudoExtras = group(r'\\\r?\n', Comment, Triple)

104

PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

105

Martin v. Löwis

2008-03-19 05:33:36 +0000

[diff] [blame]

106

tokenprog, pseudoprog, single3prog, double3prog = list(map(

107

re.compile, (Token, PseudoToken, Single3, Double3)))

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

108

endprogs = {"'": re.compile(Single), '"': re.compile(Double),

109

"'''": single3prog, '"""': double3prog,

110

"r'''": single3prog, 'r"""': double3prog,

111

"u'''": single3prog, 'u"""': double3prog,

112

"b'''": single3prog, 'b"""': double3prog,

113

"ur'''": single3prog, 'ur"""': double3prog,

114

"br'''": single3prog, 'br"""': double3prog,

Łukasz Langa

2017-05-22 15:19:09 -0700

[diff] [blame^]

115

"rb'''": single3prog, 'rb"""': double3prog,

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

116

"R'''": single3prog, 'R"""': double3prog,

117

"U'''": single3prog, 'U"""': double3prog,

118

"B'''": single3prog, 'B"""': double3prog,

119

"uR'''": single3prog, 'uR"""': double3prog,

120

"Ur'''": single3prog, 'Ur"""': double3prog,

121

"UR'''": single3prog, 'UR"""': double3prog,

122

"bR'''": single3prog, 'bR"""': double3prog,

123

"Br'''": single3prog, 'Br"""': double3prog,

124

"BR'''": single3prog, 'BR"""': double3prog,

Łukasz Langa

2017-05-22 15:19:09 -0700

[diff] [blame^]

125

"rB'''": single3prog, 'rB"""': double3prog,

126

"Rb'''": single3prog, 'Rb"""': double3prog,

127

"RB'''": single3prog, 'RB"""': double3prog,

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

128

'r': None, 'R': None,

129

'u': None, 'U': None,

130

'b': None, 'B': None}

131

132

triple_quoted = {}

133

for t in ("'''", '"""',

134

"r'''", 'r"""', "R'''", 'R"""',

135

"u'''", 'u"""', "U'''", 'U"""',

136

"b'''", 'b"""', "B'''", 'B"""',

137

"ur'''", 'ur"""', "Ur'''", 'Ur"""',

138

"uR'''", 'uR"""', "UR'''", 'UR"""',

139

"br'''", 'br"""', "Br'''", 'Br"""',

Łukasz Langa

2017-05-22 15:19:09 -0700

[diff] [blame^]

140

"bR'''", 'bR"""', "BR'''", 'BR"""',

141

"rb'''", 'rb"""', "Rb'''", 'Rb"""',

142

"rB'''", 'rB"""', "RB'''", 'RB"""',):

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

triple_quoted[t] = t

single_quoted = {}

for t in ("'", '"',

"r'", 'r"', "R'", 'R"',

147

"u'", 'u"', "U'", 'U"',

148

"b'", 'b"', "B'", 'B"',

149

"ur'", 'ur"', "Ur'", 'Ur"',

150

"uR'", 'uR"', "UR'", 'UR"',

151

"br'", 'br"', "Br'", 'Br"',

Łukasz Langa

2017-05-22 15:19:09 -0700

[diff] [blame^]

152

"bR'", 'bR"', "BR'", 'BR"',

153

"rb'", 'rb"', "Rb'", 'Rb"',

154

"rB'", 'rB"', "RB'", 'RB"',):

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

160

161

class StopTokenizing(Exception): pass

162

Martin v. Löwis

2008-03-19 05:33:36 +0000

[diff] [blame]

163

def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing

164

(srow, scol) = xxx_todo_changeme

165

(erow, ecol) = xxx_todo_changeme1

166

print("%d,%d-%d,%d:\t%s\t%s" % \

167

(srow, scol, erow, ecol, tok_name[type], repr(token)))

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

168

169

def tokenize(readline, tokeneater=printtoken):

170

"""

171

The tokenize() function accepts two parameters: one representing the

172

input stream, and one providing an output mechanism for tokenize().

173

174

The first parameter, readline, must be a callable object which provides

175

the same interface as the readline() method of built-in file objects.

176

Each call to the function should return one line of input as a string.

177

178

The second parameter, tokeneater, must also be a callable object. It is

179

called once for each token, with five arguments, corresponding to the

180

tuples generated by generate_tokens().

181

"""

182

try:

183

tokenize_loop(readline, tokeneater)

184

except StopTokenizing:

185

pass

186

187

# backwards compatible interface

188

def tokenize_loop(readline, tokeneater):

189

for token_info in generate_tokens(readline):

190

tokeneater(*token_info)

class Untokenizer:

def __init__(self):

self.tokens = []

self.prev_row = 1

self.prev_col = 0

def add_whitespace(self, start):

200

row, col = start

201

assert row <= self.prev_row

202

col_offset = col - self.prev_col

203

if col_offset:

204

self.tokens.append(" " * col_offset)

205

206

def untokenize(self, iterable):

207

for t in iterable:

208

if len(t) == 2:

209

self.compat(t, iterable)

210

break

211

tok_type, token, start, end, line = t

212

self.add_whitespace(start)

213

self.tokens.append(token)

214

self.prev_row, self.prev_col = end

215

if tok_type in (NEWLINE, NL):

216

self.prev_row += 1

217

self.prev_col = 0

218

return "".join(self.tokens)

219

220

def compat(self, token, iterable):

221

startline = False

222

indents = []

223

toks_append = self.tokens.append

224

toknum, tokval = token

225

if toknum in (NAME, NUMBER):

226

tokval += ' '

227

if toknum in (NEWLINE, NL):

228

startline = True

229

for tok in iterable:

230

toknum, tokval = tok[:2]

231

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

232

if toknum in (NAME, NUMBER, ASYNC, AWAIT):

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

tokval += ' '

if toknum == INDENT:

indents.append(tokval)

237

continue

238

elif toknum == DEDENT:

239

indents.pop()

240

continue

241

elif toknum in (NEWLINE, NL):

242

startline = True

243

elif startline and indents:

244

toks_append(indents[-1])

startline = False

toks_append(tokval)

Serhiy Storchaka

2016-03-20 23:36:29 +0200

[diff] [blame]

248

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)

Serhiy Storchaka

768c16c

2014-01-09 18:36:09 +0200

[diff] [blame]

249

blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

250

Benjamin Peterson

d9af52b

2009-11-02 18:16:28 +0000

[diff] [blame]

251

def _get_normal_name(orig_enc):

252

"""Imitates get_normal_name in tokenizer.c."""

253

# Only care about the first 12 characters.

254

enc = orig_enc[:12].lower().replace("_", "-")

255

if enc == "utf-8" or enc.startswith("utf-8-"):

256

return "utf-8"

257

if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \

258

enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):

return "iso-8859-1"

return orig_enc

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

262

def detect_encoding(readline):

263

"""

264

The detect_encoding() function is used to detect the encoding that should

Ezio Melotti

4bcc796

2013-11-25 05:14:51 +0200

[diff] [blame]

265

be used to decode a Python source file. It requires one argument, readline,

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

266

in the same way as the tokenize() generator.

267

268

It will call readline a maximum of twice, and return the encoding used

269

(as a string) and a list of any lines (left as bytes) it has read

270

in.

271

272

It detects the encoding from the presence of a utf-8 bom or an encoding

Benjamin Peterson

2010-03-23 03:22:05 +0000

[diff] [blame]

273

cookie as specified in pep-0263. If both a bom and a cookie are present, but

274

disagree, a SyntaxError will be raised. If the encoding cookie is an invalid

275

charset, raise a SyntaxError. Note that if a utf-8 bom is found,

276

'utf-8-sig' is returned.

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

277

278

If no encoding is specified, then the default of 'utf-8' will be returned.

279

"""

280

bom_found = False

281

encoding = None

Benjamin Peterson

2010-03-23 03:22:05 +0000

[diff] [blame]

282

default = 'utf-8'

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

def read_or_stop():

try:

return readline()

except StopIteration:

Benjamin Peterson

8d26b0b

2010-05-07 19:10:11 +0000

[diff] [blame]

287

return bytes()

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

288

289

def find_cookie(line):

290

try:

291

line_string = line.decode('ascii')

292

except UnicodeDecodeError:

293

return None

Serhiy Storchaka

dafea85

2013-09-16 23:51:56 +0300

[diff] [blame]

294

match = cookie_re.match(line_string)

295

if not match:

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

296

return None

Serhiy Storchaka

dafea85

2013-09-16 23:51:56 +0300

[diff] [blame]

297

encoding = _get_normal_name(match.group(1))

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

298

try:

299

codec = lookup(encoding)

300

except LookupError:

301

# This behaviour mimics the Python interpreter

302

raise SyntaxError("unknown encoding: " + encoding)

303

Benjamin Peterson

2021100

2009-11-25 18:34:42 +0000

[diff] [blame]

304

if bom_found:

305

if codec.name != 'utf-8':

306

# This behaviour mimics the Python interpreter

307

raise SyntaxError('encoding problem: utf-8')

Benjamin Peterson

2010-03-23 03:22:05 +0000

[diff] [blame]

308

encoding += '-sig'

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

309

return encoding

310

311

first = read_or_stop()

312

if first.startswith(BOM_UTF8):

313

bom_found = True

314

first = first[3:]

Benjamin Peterson

2010-03-23 03:22:05 +0000

[diff] [blame]

315

default = 'utf-8-sig'

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

316

if not first:

Benjamin Peterson

2010-03-23 03:22:05 +0000

[diff] [blame]

317

return default, []

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

318

319

encoding = find_cookie(first)

320

if encoding:

321

return encoding, [first]

Serhiy Storchaka

768c16c

2014-01-09 18:36:09 +0200

[diff] [blame]

322

if not blank_re.match(first):

323

return default, [first]

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

324

325

second = read_or_stop()

326

if not second:

Benjamin Peterson

2010-03-23 03:22:05 +0000

[diff] [blame]

327

return default, [first]

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

328

329

encoding = find_cookie(second)

330

if encoding:

331

return encoding, [first, second]

332

Benjamin Peterson

2010-03-23 03:22:05 +0000

[diff] [blame]

333

return default, [first, second]

Benjamin Peterson

2009-05-09 19:42:23 +0000

[diff] [blame]

334

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

335

def untokenize(iterable):

336

"""Transform tokens back into Python source code.

337

338

Each element returned by the iterable must be a token sequence

339

with at least two elements, a token number and token value. If

340

only two tokens are passed, the resulting output is poor.

341

342

Round-trip invariant for full input:

343

Untokenized source will match input source exactly

344

345

Round-trip invariant for limited intput:

346

# Output text will tokenize the back to the input

347

t1 = [tok[:2] for tok in generate_tokens(f.readline)]

348

newcode = untokenize(t1)

349

readline = iter(newcode.splitlines(1)).next

350

t2 = [tok[:2] for tokin generate_tokens(readline)]

assert t1 == t2

"""

ut = Untokenizer()

return ut.untokenize(iterable)

355

356

def generate_tokens(readline):

357

"""

Ezio Melotti

4bcc796

2013-11-25 05:14:51 +0200

[diff] [blame]

358

The generate_tokens() generator requires one argument, readline, which

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

359

must be a callable object which provides the same interface as the

360

readline() method of built-in file objects. Each call to the function

361

should return one line of input as a string. Alternately, readline

362

can be a callable function terminating with StopIteration:

363

readline = open(myfile).next # Example of alternate readline

364

365

The generator produces 5-tuples with these members: the token type; the

366

token string; a 2-tuple (srow, scol) of ints specifying the row and

367

column where the token begins in the source; a 2-tuple (erow, ecol) of

368

ints specifying the row and column where the token ends in the source;

369

and the line on which the token was found. The line passed is the

370

logical line; continuation lines are included.

371

"""

372

lnum = parenlev = continued = 0

373

namechars, numchars = string.ascii_letters + '_', '0123456789'

374

contstr, needcont = '', 0

contline = None

indents = [0]

Yury Selivanov

2015-07-23 15:01:58 +0300

[diff] [blame]

378

# 'stashed' and 'async_*' are used for async/await parsing

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

379

stashed = None

Yury Selivanov

2015-07-23 15:01:58 +0300

[diff] [blame]

380

async_def = False

381

async_def_indent = 0

382

async_def_nl = False

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

383

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

384

while 1: # loop over lines in stream

385

try:

386

line = readline()

387

except StopIteration:

388

line = ''

389

lnum = lnum + 1

390

pos, max = 0, len(line)

391

392

if contstr: # continued string

393

if not line:

Martin v. Löwis

2008-03-19 05:33:36 +0000

[diff] [blame]

394

raise TokenError("EOF in multi-line string", strstart)

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

395

endmatch = endprog.match(line)

396

if endmatch:

397

pos = end = endmatch.end(0)

398

yield (STRING, contstr + line[:end],

399

strstart, (lnum, end), contline + line)

400

contstr, needcont = '', 0

401

contline = None

402

elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':

403

yield (ERRORTOKEN, contstr + line,

404

strstart, (lnum, len(line)), contline)

contstr = ''

contline = None

continue

else:

contstr = contstr + line

410

contline = contline + line

411

continue

412

413

elif parenlev == 0 and not continued: # new statement

414

if not line: break

415

column = 0

416

while pos < max: # measure leading whitespace

417

if line[pos] == ' ': column = column + 1

Benjamin Peterson

d9af52b

2009-11-02 18:16:28 +0000

[diff] [blame]

418

elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

419

elif line[pos] == '\f': column = 0

else: break

pos = pos + 1

if pos == max: break

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

if stashed:

yield stashed

stashed = None

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

428

if line[pos] in '#\r\n': # skip comments or blank lines

429

if line[pos] == '#':

430

comment_token = line[pos:].rstrip('\r\n')

431

nl_pos = pos + len(comment_token)

432

yield (COMMENT, comment_token,

433

(lnum, pos), (lnum, pos + len(comment_token)), line)

434

yield (NL, line[nl_pos:],

435

(lnum, nl_pos), (lnum, len(line)), line)

436

else:

437

yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],

438

(lnum, pos), (lnum, len(line)), line)

439

continue

440

441

if column > indents[-1]: # count indents or dedents

442

indents.append(column)

443

yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

444

while column < indents[-1]:

445

if column not in indents:

446

raise IndentationError(

447

"unindent does not match any outer indentation level",

448

("<tokenize>", lnum, pos, line))

449

indents = indents[:-1]

Yury Selivanov

8fb307c

2015-07-22 13:33:45 +0300

[diff] [blame]

450

Yury Selivanov

2015-07-23 15:01:58 +0300

[diff] [blame]

451

if async_def and async_def_indent >= indents[-1]:

452

async_def = False

453

async_def_nl = False

454

async_def_indent = 0

Yury Selivanov

8fb307c

2015-07-22 13:33:45 +0300

[diff] [blame]

455

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

456

yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

457

Yury Selivanov

2015-07-23 15:01:58 +0300

[diff] [blame]

458

if async_def and async_def_nl and async_def_indent >= indents[-1]:

async_def = False

async_def_nl = False

async_def_indent = 0

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

463

else: # continued statement

464

if not line:

Martin v. Löwis

2008-03-19 05:33:36 +0000

[diff] [blame]

465

raise TokenError("EOF in multi-line statement", (lnum, 0))

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

continued = 0

while pos < max:

pseudomatch = pseudoprog.match(line, pos)

470

if pseudomatch: # scan for tokens

471

start, end = pseudomatch.span(1)

472

spos, epos, pos = (lnum, start), (lnum, end), end

473

token, initial = line[start:end], line[start]

474

475

if initial in numchars or \

476

(initial == '.' and token != '.'): # ordinary number

477

yield (NUMBER, token, spos, epos, line)

478

elif initial in '\r\n':

479

newline = NEWLINE

480

if parenlev > 0:

481

newline = NL

Yury Selivanov

2015-07-23 15:01:58 +0300

[diff] [blame]

482

elif async_def:

483

async_def_nl = True

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

484

if stashed:

485

yield stashed

486

stashed = None

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

487

yield (newline, token, spos, epos, line)

Yury Selivanov

2015-07-23 15:01:58 +0300

[diff] [blame]

488

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

489

elif initial == '#':

490

assert not token.endswith("\n")

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

491

if stashed:

492

yield stashed

493

stashed = None

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

494

yield (COMMENT, token, spos, epos, line)

495

elif token in triple_quoted:

496

endprog = endprogs[token]

497

endmatch = endprog.match(line, pos)

498

if endmatch: # all on one line

499

pos = endmatch.end(0)

500

token = line[start:pos]

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

501

if stashed:

502

yield stashed

503

stashed = None

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

504

yield (STRING, token, spos, (lnum, pos), line)

505

else:

506

strstart = (lnum, start) # multiple lines

507

contstr = line[start:]

508

contline = line

509

break

510

elif initial in single_quoted or \

511

token[:2] in single_quoted or \

512

token[:3] in single_quoted:

513

if token[-1] == '\n': # continued string

514

strstart = (lnum, start)

515

endprog = (endprogs[initial] or endprogs[token[1]] or

516

endprogs[token[2]])

517

contstr, needcont = line[start:], 1

518

contline = line

519

break

520

else: # ordinary string

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

521

if stashed:

522

yield stashed

523

stashed = None

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

524

yield (STRING, token, spos, epos, line)

525

elif initial in namechars: # ordinary name

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

526

if token in ('async', 'await'):

Yury Selivanov

2015-07-23 15:01:58 +0300

[diff] [blame]

527

if async_def:

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

528

yield (ASYNC if token == 'async' else AWAIT,

529

token, spos, epos, line)

530

continue

531

532

tok = (NAME, token, spos, epos, line)

533

if token == 'async' and not stashed:

stashed = tok

continue

if token == 'def':

if (stashed

and stashed[0] == NAME

540

and stashed[1] == 'async'):

541

Yury Selivanov

2015-07-23 15:01:58 +0300

[diff] [blame]

542

async_def = True

543

async_def_indent = indents[-1]

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

544

545

yield (ASYNC, stashed[1],

546

stashed[2], stashed[3],

547

stashed[4])

548

stashed = None

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

if stashed:

yield stashed

stashed = None

yield tok

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

555

elif initial == '\\': # continued stmt

556

# This yield is new; needed for better idempotency:

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

557

if stashed:

558

yield stashed

559

stashed = None

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

560

yield (NL, token, spos, (lnum, pos), line)

561

continued = 1

562

else:

563

if initial in '([{': parenlev = parenlev + 1

564

elif initial in ')]}': parenlev = parenlev - 1

Yury Selivanov

2015-05-11 22:57:16 -0400

[diff] [blame]

565

if stashed:

566

yield stashed

567

stashed = None

Martin v. Löwis

2008-03-19 05:04:44 +0000

[diff] [blame]

568

yield (OP, token, spos, epos, line)

569

else:

570

yield (ERRORTOKEN, line[pos],

571

(lnum, pos), (lnum, pos+1), line)

572

pos = pos + 1

573

Yury Selivanov